#### Read in data ####
# NOTE(review): this path starts with "Data/" while every other path in this
# script starts with "data/". On a case-sensitive filesystem only one spelling
# can resolve -- confirm which one is correct.
df <- read_csv("Data/ca-local-election-data/ced-merged.csv")

#### Clean names ####
# Keep the raw name fields in `last` / `first` (the raw first name is reused
# below to repair initials), then reduce `lname` / `fname` to their first
# whitespace-delimited word.
df <- df %>%
  mutate(last = lname,
         first = fname,
         lname = word(lname, 1),
         fname = word(fname, 1))

# Remove extra commas and period.
# str_remove() drops only the first match; this has to happen before the
# one-letter check below so that an initial such as "J." counts as one letter.
df <- df %>%
  mutate(fname = str_remove(fname, ","),
         lname = str_remove(lname, ","),
         fname = str_remove(fname, "\\."),
         lname = str_remove(lname, "\\."))

# Hyphenated names: keep just the part AFTER the last hyphen in last names and
# just the part BEFORE the first hyphen in first names.
df <- df %>%
  mutate(lname = gsub(".*-", "", lname),
         fname = gsub("-.*", "", fname))

#### Repair names with only one letter for first name
# A one-character first name is treated as an initial and replaced with the
# second word of the raw first-name field.
df <- df %>%
  mutate(fname = case_when(nchar(fname) == 1 ~ word(first, 2),
                           nchar(fname) != 1 ~ fname))

# Remove punctuation (anything that is not a letter or digit)
df <- df %>%
  mutate(lname = str_replace_all(lname, "[^[:alnum:]]", ""),
         fname = str_replace_all(fname, "[^[:alnum:]]", ""))

# Title-case BEFORE fixing accents: the comparisons below are written in title
# case, so names that arrive in upper case (e.g. "MÓNICA") would otherwise
# never match and would keep their accent.
df <- df %>%
  mutate(fname = str_to_title(fname),
         lname = str_to_title(lname))

# Fix names with accents
df <- df %>%
  mutate(fname = case_when(fname == "Mónica" ~ "Monica",
                           TRUE ~ fname),
         lname = case_when(lname == "Gómez" ~ "Gomez",
                           lname == "Durón" ~ "Duron",
                           lname == "López" ~ "Lopez",
                           TRUE ~ lname))

#### Filtering ####
# Keep rows that satisfy both conditions:
#   * the race does not have exactly one candidate, and
#   * the office is a school board or city council seat.
# Rows where either condition evaluates to NA are dropped by filter().
df <- df %>%
  filter(num_cand != 1,
         office %in% c("SCHOOL BOARD MEMBER", "CITY COUNCIL"))

#### Impute gender probabilities ####
# Look up every candidate first name with gender(), using the "ssa" method
# over the 1950-1990 range.
name_probs <- gender(df$fname,
                     years = c(1950, 1990),
                     method = "ssa")

# Rename the key to match `df`, drop the year-range columns, and collapse the
# lookup to unique rows. Without the distinct() here, a first name that is
# repeated in both tables makes the join below many-to-many and multiplies
# rows before they are collapsed again.
name_probs <- name_probs %>%
  rename(fname = name) %>%
  select(-year_min,
         -year_max) %>%
  distinct()

# The trailing distinct() is kept from the original so the resulting rows are
# unchanged (it also collapses rows of `df` that are exact duplicates).
df <- full_join(df, name_probs, by = "fname") %>%
  distinct()

rm(name_probs)

#### Surname commonality ####
# Read the census surname table, build a lower-case copy of the surname
# (`last_clean`) and a title-cased join key (`lname`), and drop the columns
# that are not carried into `df`.
census_surnames <- read.table(
  "data/census-names/dist.all.last.txt",
  col.names = c("surname", "percent_freq", "cum_freq", "rank")
) %>%
  mutate(last_clean = tolower(surname),
         lname = str_to_title(last_clean)) %>%
  select(-c(cum_freq, surname))

# Title-case both name columns so the key matches the census table, attach the
# surname frequencies, and treat surnames missing from the table as having a
# frequency of 0.
df <- df %>%
  mutate(lname = str_to_title(lname),
         fname = str_to_title(fname)) %>%
  left_join(census_surnames, by = "lname") %>%
  mutate(percent_freq = replace(percent_freq, is.na(percent_freq), 0))

rm(census_surnames)

#### First name commonality ####
# Slice the `babynames` table down to a single birth year and rename its name
# column so it can be joined to `df` on `fname` + `sex`.
names_in_year <- function(birth_year) {
  babynames %>%
    filter(year == birth_year) %>%
    rename(fname = name) %>%
    select(-year)
}

# Recode the imputed gender into the "M" / "F" codes used as the join key.
df <- df %>%
  mutate(sex = case_when(gender == "male" ~ "M",
                         gender == "female" ~ "F"))

# 1950 cohort (unsuffixed columns). Names without a match get 0.
df <- df %>%
  left_join(names_in_year(1950), by = c("fname", "sex")) %>%
  rename(fprop = prop,
         fn = n) %>%
  mutate(fprop = replace(fprop, is.na(fprop), 0),
         fn = replace(fn, is.na(fn), 0))

# 1930 cohort
df <- df %>%
  left_join(names_in_year(1930), by = c("fname", "sex")) %>%
  rename(fprop30 = prop,
         fn30 = n) %>%
  mutate(fprop30 = replace(fprop30, is.na(fprop30), 0),
         fn30 = replace(fn30, is.na(fn30), 0))

# 1940 cohort
df <- df %>%
  left_join(names_in_year(1940), by = c("fname", "sex")) %>%
  rename(fprop40 = prop,
         fn40 = n) %>%
  mutate(fprop40 = replace(fprop40, is.na(fprop40), 0),
         fn40 = replace(fn40, is.na(fn40), 0))

# 1960 cohort
df <- df %>%
  left_join(names_in_year(1960), by = c("fname", "sex")) %>%
  rename(fprop60 = prop,
         fn60 = n) %>%
  mutate(fprop60 = replace(fprop60, is.na(fprop60), 0),
         fn60 = replace(fn60, is.na(fn60), 0))

rm(names_in_year)

#### Syllable and character counts ####
# Run readability() on the last names, drop the listed score/count columns,
# and prefix the retained chars / sylls / cl columns with "l".
lname_stats <- readability(df$lname) %>%
  select(-c(re, gl, ari, smog, nonwords, sents, polys, words, wordchars)) %>%
  rename(lchars = chars,
         lsylls = sylls,
         lcl = cl)

# Same treatment for the first names, with an "f" prefix.
fname_stats <- readability(df$fname) %>%
  select(-c(re, gl, ari, smog, nonwords, sents, polys, words, wordchars)) %>%
  rename(fchars = chars,
         fsylls = sylls,
         fcl = cl)

# Column-bind by position: the readability output is matched to `df` purely by
# row order, last-name columns first and first-name columns second.
df <- df %>%
  bind_cols(lname_stats) %>%
  bind_cols(fname_stats)

rm(lname_stats, fname_stats)

#### Imputing race ####
# Build the input table for predict_race(): the name / gender / county columns
# from `df`, plus a `surname` copy of the last name and a constant state code.
candidates <- df %>%
  select(fname,
         lname,
         gender,
         co_name) %>%
  mutate(surname = lname,
         state = "ca")

# Assigning county codes to counties
# Three-digit county codes keyed by the upper-case county name in `co_name`.
# The original mapping spelled Humboldt as 'HIMBOLDT', so Humboldt rows got an
# NA county and were silently dropped by na.omit() below.
county_fips <- c(
  "ALAMEDA" = "001",
  "ALPINE" = "003",
  "AMADOR" = "005",
  "BUTTE" = "007",
  "CALAVERAS" = "009",
  "COLUSA" = "011",
  "CONTRA COSTA" = "013",
  "DEL NORTE" = "015",
  "EL DORADO" = "017",
  "FRESNO" = "019",
  "GLENN" = "021",
  "HUMBOLDT" = "023",
  "IMPERIAL" = "025",
  "INYO" = "027",
  "KERN" = "029",
  "KINGS" = "031",
  "LAKE" = "033",
  "LASSEN" = "035",
  "LOS ANGELES" = "037",
  "MADERA" = "039",
  "MARIN" = "041",
  "MARIPOSA" = "043",
  "MENDOCINO" = "045",
  "MERCED" = "047",
  "MODOC" = "049",
  "MONO" = "051",
  "MONTEREY" = "053",
  "NAPA" = "055",
  "NEVADA" = "057",
  "ORANGE" = "059",
  "PLACER" = "061",
  "PLUMAS" = "063",
  "RIVERSIDE" = "065",
  "SACRAMENTO" = "067",
  "SAN BENITO" = "069",
  "SAN BERNARDINO" = "071",
  "SAN DIEGO" = "073",
  "SAN FRANCISCO" = "075",
  "SAN JOAQUIN" = "077",
  "SAN LUIS OBISPO" = "079",
  "SAN MATEO" = "081",
  "SANTA BARBARA" = "083",
  "SANTA CLARA" = "085",
  "SANTA CRUZ" = "087",
  "SHASTA" = "089",
  "SIERRA" = "091",
  "SISKIYOU" = "093",
  "SOLANO" = "095",
  "SONOMA" = "097",
  "STANISLAUS" = "099",
  "SUTTER" = "101",
  "TEHAMA" = "103",
  "TRINITY" = "105",
  "TULARE" = "107",
  "TUOLUMNE" = "109",
  "VENTURA" = "111",
  "YOLO" = "113",
  "YUBA" = "115"
)

# Names that are not in the lookup (or are NA) yield an NA county.
candidates <- candidates %>%
  mutate(county = unname(county_fips[as.character(co_name)]))

# Drop every row with an NA in any column.
# NOTE(review): this also drops candidates whose gender could not be imputed,
# even though predict_race() is called with sex = FALSE -- confirm that losing
# their race predictions is intended.
candidates <- candidates %>%
  na.omit() %>%
  mutate(sex = case_when(gender == "female" ~ 1,
                         gender == "male" ~ 0))

# Prefer a key supplied through the environment; fall back to the key that was
# previously hard-coded so existing runs keep working.
# NOTE(review): an API key committed to source should be rotated and the
# fallback literal removed once CENSUS_API_KEY is set everywhere.
census_key <- Sys.getenv("CENSUS_API_KEY")
if (!nzchar(census_key)) {
  census_key <- '02925aecfc32b58a286e7a6287f587ead97ea2e0'
}

x <- predict_race(candidates,
                  census.key = census_key,
                  census.geo = "county",
                  surname.year = 2000,
                  sex = FALSE)

# Keep the join keys and the predicted probabilities, collapsed to unique
# rows: candidates that appear in several races would otherwise make the join
# below many-to-many and multiply rows before distinct() collapses them.
x <- x %>%
  select(lname,
         fname,
         gender,
         co_name,
         pred.whi,
         pred.bla,
         pred.his,
         pred.asi,
         pred.oth) %>%
  distinct()

df <- full_join(df, x, by = c("lname", "fname", "gender", "co_name"))

df <- df %>%
  distinct()

rm(candidates, x, county_fips, census_key)

#### Add algorithm ratings ####
# Ratings tables keyed by first name and by last name respectively.
sb_first <- read_csv("data/Algorithm-ratings/SchoolBoardNamesFirstRatings.csv")
sb_last <- read_csv("data/Algorithm-ratings/SchoolBoardNamesLastRatings.csv")

# Attach both ratings tables, average the letter and phoneme ratings into one
# score per name, and flip the sign of every rating column. The averages are
# computed first in the mutate() so they are built from the un-flipped inputs.
df <- df %>%
  left_join(sb_first, by = "fname") %>%
  left_join(sb_last, by = "lname") %>%
  mutate(first_algorithm = ((first_letters + first_phonemes) / 2) * (-1),
         last_algorithm = ((last_letters + last_phonemes) / 2) * (-1),
         first_letters = first_letters * (-1),
         first_phonemes = first_phonemes * (-1),
         last_letters = last_letters * (-1),
         last_phonemes = last_phonemes * (-1))

#### Clean up model variables ####
# Derive the modelling columns in a single pass, then drop the two raw columns
# that are no longer needed.
df <- df %>%
  mutate(
    # inc: 1 -> incumbent, 2 -> not; any other value stays NA
    incumbent = case_when(inc == 1 ~ 1,
                          inc == 2 ~ 0),
    # elected: 1 -> won, 2 or 3 -> did not win; any other value stays NA
    won_election = case_when(elected == 1 ~ 1,
                             elected %in% c(2, 3) ~ 0),
    seats_comps = num_seats / num_cand,
    # Race indicators: 1 when the predicted probability exceeds .5, otherwise
    # 0 (a missing prediction also yields 0)
    white = if_else(pred.whi > .5, 1, 0, missing = 0),
    black = if_else(pred.bla > .5, 1, 0, missing = 0),
    hisp = if_else(pred.his > .5, 1, 0, missing = 0),
    asian = if_else(pred.asi > .5, 1, 0, missing = 0),
    # 1 when none of the four indicators above is set
    race_unsure = if_else(white == 0 & black == 0 & hisp == 0 & asian == 0,
                          1, 0),
    # Small constant keeps log() finite for surnames with a frequency of 0
    lnfreq_lastnames = log(0.0000001 + percent_freq),
    office = as.factor(office),
    totvotes1000 = totvotes / 1000,
    female = case_when(gender == "female" ~ 1,
                       gender == "male" ~ 0),
    percent = percent * 100
  ) %>%
  select(-inc,
         -num_seats)

#### Save out data ####
write_csv(df, "data/analysis/ced-clean-analysis.csv")

# Remove only the objects this script still has in the workspace at this
# point. The previous rm(list = ls()) wiped every object in the calling
# environment, including anything a parent script or interactive session had
# created before running this file.
rm(df, sb_first, sb_last)
